Create a crawler

POST /api/1/crawlers

curl --request POST \
  --url https://crawler.algolia.com/api/1/crawlers \
  --header 'Authorization: Basic <encoded-value>' \
  --header 'Content-Type: application/json' \
  --data '{
  "name": "test-crawler",
  "config": {
    "actions": [
      {
        "autoGenerateObjectIDs": true,
        "cache": {
          "enabled": true
        },
        "discoveryPatterns": [
          "https://www.algolia.com/**"
        ],
        "fileTypesToMatch": [
          "html",
          "pdf"
        ],
        "hostnameAliases": {
          "dev.example.com": "example.com"
        },
        "indexName": "algolia_website",
        "name": "<string>",
        "pathAliases": {
          "example.com": {
            "/foo": "/bar"
          }
        },
        "pathsToMatch": [
          "https://www.algolia.com/**"
        ],
        "recordExtractor": {
          "__type": "function",
          "source": "<string>"
        },
        "schedule": "<string>",
        "selectorsToMatch": [
          ".products",
          "!.featured"
        ]
      }
    ],
    "apiKey": "<string>",
    "appId": "<string>",
    "exclusionPatterns": [
      "https://www.example.com/excluded",
      "!https://www.example.com/this-one-url",
      "https://www.example.com/exclude/**"
    ],
    "externalData": [
      "testCSV"
    ],
    "extraUrls": [
      "<string>"
    ],
    "ignoreCanonicalTo": true,
    "ignoreNoFollowTo": true,
    "ignoreNoIndex": true,
    "ignorePaginationAttributes": true,
    "ignoreQueryParams": [
      "ref",
      "utm_*"
    ],
    "ignoreRobotsTxtRules": true,
    "indexPrefix": "crawler_",
    "initialIndexSettings": {},
    "linkExtractor": {
      "__type": "function",
      "source": "({ $, url, defaultExtractor }) => {\n  if (/example.com\\/doc\\//.test(url.href)) {\n    // For all pages under `/doc`, only extract the first found URL.\n    return defaultExtractor().slice(0, 1)\n  }\n  // For all other pages, use the default.\n  return defaultExtractor()\n}\n"
    },
    "login": {
      "url": "https://example.com/secure/login-with-post",
      "requestOptions": {
        "method": "POST",
        "headers": {
          "Content-Type": "application/x-www-form-urlencoded"
        },
        "body": "id=my-id&password=my-password",
        "timeout": 5000
      }
    },
    "maxDepth": 5,
    "maxUrls": 250,
    "rateLimit": 4,
    "renderJavaScript": true,
    "requestOptions": {
      "proxy": "<string>",
      "timeout": 30000,
      "retries": 3,
      "headers": {
        "Accept-Language": "fr-FR",
        "Authorization": "Bearer Aerehdf==",
        "Cookie": "session=1234"
      }
    },
    "safetyChecks": {
      "beforeIndexPublishing": {
        "maxLostRecordsPercentage": 10,
        "maxFailedUrls": 123
      }
    },
    "saveBackup": true,
    "schedule": "every weekday at 12:00 pm",
    "sitemaps": [
      "https://example.com/sitemap.xyz"
    ],
    "startUrls": [
      "https://www.example.com"
    ]
  }
}'

{
  "id": "e0f6db8a-24f5-4092-83a4-1b2c6cb6d809"
}

Authorizations

Authorization (string, header, required)

Basic authentication header of the form `Basic <encoded-value>`, where `<encoded-value>` is the base64-encoded string `username:password`.

Body

application/json

Response

200

application/json

The response is of type object.

Test crawl a URL | Retrieve crawler details

curl --request POST \
  --url https://crawler.algolia.com/api/1/crawlers \
  --header 'Authorization: Basic <encoded-value>' \
  --header 'Content-Type: application/json' \
  --data '{
  "name": "test-crawler",
  "config": {
    "actions": [
      {
        "autoGenerateObjectIDs": true,
        "cache": {
          "enabled": true
        },
        "discoveryPatterns": [
          "https://www.algolia.com/**"
        ],
        "fileTypesToMatch": [
          "html",
          "pdf"
        ],
        "hostnameAliases": {
          "dev.example.com": "example.com"
        },
        "indexName": "algolia_website",
        "name": "<string>",
        "pathAliases": {
          "example.com": {
            "/foo": "/bar"
          }
        },
        "pathsToMatch": [
          "https://www.algolia.com/**"
        ],
        "recordExtractor": {
          "__type": "function",
          "source": "<string>"
        },
        "schedule": "<string>",
        "selectorsToMatch": [
          ".products",
          "!.featured"
        ]
      }
    ],
    "apiKey": "<string>",
    "appId": "<string>",
    "exclusionPatterns": [
      "https://www.example.com/excluded",
      "!https://www.example.com/this-one-url",
      "https://www.example.com/exclude/**"
    ],
    "externalData": [
      "testCSV"
    ],
    "extraUrls": [
      "<string>"
    ],
    "ignoreCanonicalTo": true,
    "ignoreNoFollowTo": true,
    "ignoreNoIndex": true,
    "ignorePaginationAttributes": true,
    "ignoreQueryParams": [
      "ref",
      "utm_*"
    ],
    "ignoreRobotsTxtRules": true,
    "indexPrefix": "crawler_",
    "initialIndexSettings": {},
    "linkExtractor": {
      "__type": "function",
      "source": "({ $, url, defaultExtractor }) => {\n  if (/example.com\\/doc\\//.test(url.href)) {\n    // For all pages under `/doc`, only extract the first found URL.\n    return defaultExtractor().slice(0, 1)\n  }\n  // For all other pages, use the default.\n  return defaultExtractor()\n}\n"
    },
    "login": {
      "url": "https://example.com/secure/login-with-post",
      "requestOptions": {
        "method": "POST",
        "headers": {
          "Content-Type": "application/x-www-form-urlencoded"
        },
        "body": "id=my-id&password=my-password",
        "timeout": 5000
      }
    },
    "maxDepth": 5,
    "maxUrls": 250,
    "rateLimit": 4,
    "renderJavaScript": true,
    "requestOptions": {
      "proxy": "<string>",
      "timeout": 30000,
      "retries": 3,
      "headers": {
        "Accept-Language": "fr-FR",
        "Authorization": "Bearer Aerehdf==",
        "Cookie": "session=1234"
      }
    },
    "safetyChecks": {
      "beforeIndexPublishing": {
        "maxLostRecordsPercentage": 10,
        "maxFailedUrls": 123
      }
    },
    "saveBackup": true,
    "schedule": "every weekday at 12:00 pm",
    "sitemaps": [
      "https://example.com/sitemap.xyz"
    ],
    "startUrls": [
      "https://www.example.com"
    ]
  }
}'

{
  "id": "e0f6db8a-24f5-4092-83a4-1b2c6cb6d809"
}

Tools

Crawler

Authorizations

Body

Response